/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ReplaceMissingWithUserConstant.java
 *    Copyright (C) 2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.unsupervised.attribute;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.core.WeightedAttributesHandler;
import weka.core.WeightedInstancesHandler;
import weka.filters.StreamableFilter;
import weka.filters.UnsupervisedFilter;

/**
 * <!-- globalinfo-start --> Replaces all missing values for nominal, string,
 * numeric and date attributes in the dataset with user-supplied constant
 * values.
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class ReplaceMissingWithUserConstant extends PotentialClassIgnorer implements UnsupervisedFilter, StreamableFilter, EnvironmentHandler, WeightedInstancesHandler, WeightedAttributesHandler {

    /** For serialization */
    private static final long serialVersionUID = -7334039452189350356L;

    /** Environment variables */
    protected transient Environment m_env;

    /** Range of columns to consider */
    protected Range m_selectedRange;

    /** Attribute selection as supplied by the user (range list or names) */
    protected String m_range = "first-last";

    /** Attribute selection after resolving environment vars */
    protected String m_resolvedRange = "";

    /** Constant for replacing missing values in nominal/string atts with */
    protected String m_nominalStringConstant = "";

    /**
     * Replacement value for nominal/string atts after resolving environment vars
     */
    protected String m_resolvedNominalStringConstant = "";

    /** Constant for replacing missing values in numeric attributes with */
    protected String m_numericConstant = "0";

    /** Replacement value for numeric atts after resolving environment vars */
    protected String m_resolvedNumericConstant = "";

    /** Parsed numeric constant value */
    protected double m_numericConstVal = 0;

    /** Constant for replacing missing values in date attributes with */
    protected String m_dateConstant = "";

    /** Replacement value for date atts after resolving environment vars */
    protected String m_resolvedDateConstant = "";

    /** Parsed date value as a double */
    protected double m_dateConstVal = 0;

    /** Formatting string to use for parsing the date constant */
    protected String m_defaultDateFormat = "yyyy-MM-dd'T'HH:mm:ss";

    /** Formatting string after resolving environment vars */
    protected String m_resolvedDateFormat = "";

    /**
     * Returns a string describing this filter
     * 
     * @return a description of the filter suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String globalInfo() {

        return "Replaces all missing values for nominal, string, numeric and date " + "attributes in the dataset with user-supplied constant values.";
    }

    /**
     * Returns the capabilities of this filter: all attribute and class types
     * are accepted, including missing values and data without a class.
     * 
     * @return the capabilities of this filter
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();

        // attributes
        result.enableAllAttributes();
        result.enable(Capability.MISSING_VALUES);

        // class
        result.enableAllClasses();
        result.enable(Capability.MISSING_CLASS_VALUES);
        result.enable(Capability.NO_CLASS);

        return result;
    }

    /**
     * Returns an enumeration describing the available options.
     * 
     * @return an enumeration of all the available options
     */
    @Override
    public Enumeration<Option> listOptions() {

        Vector<Option> opts = new Vector<Option>(5);

        // the option name has to be "A" to agree with the synopsis and with
        // setOptions()/getOptions(); "R" belongs to the numeric constant
        opts.addElement(new Option("\tSpecify list of attributes to replace missing values for " + "\n\t(as weka range list of indices or a comma separated list of attribute names).\n" + "\t(default: consider all attributes)", "A", 1, "-A <index1,index2-index4,... | att-name1,att-name2,...>"));

        opts.addElement(new Option("\tSpecify the replacement constant for nominal/string attributes", "N", 1, "-N"));
        opts.addElement(new Option("\tSpecify the replacement constant for numeric attributes" + "\n\t(default: 0)", "R", 1, "-R"));
        opts.addElement(new Option("\tSpecify the replacement constant for date attributes", "D", 1, "-D"));
        opts.addElement(new Option("\tSpecify the date format for parsing the replacement date constant" + "\n\t(default: yyyy-MM-dd'T'HH:mm:ss)", "F", 1, "-F"));

        opts.addAll(Collections.list(super.listOptions()));

        return opts.elements();
    }

    /**
     * 
     * Parses a given list of options.
     * <p/>
     * 
     * <!-- options-start --> Valid options are:
     * <p/>
     * 
     * <pre>
     * -A &lt;index1,index2-index4,... | att-name1,att-name2,...&gt;
     *  Specify list of attributes to replace missing values for 
     *  (as weka range list of indices or a comma separated list of attribute names).
     *  (default: consider all attributes)
     * </pre>
     * 
     * <pre>
     * -N
     *  Specify the replacement constant for nominal/string attributes
     * </pre>
     * 
     * <pre>
     * -R
     *  Specify the replacement constant for numeric attributes
     *  (default: 0)
     * </pre>
     * 
     * <pre>
     * -D
     *  Specify the replacement constant for date attributes
     * </pre>
     * 
     * <pre>
     * -F
     *  Specify the date format for parsing the replacement date constant
     *  (default: yyyy-MM-dd'T'HH:mm:ss)
     * </pre>
     * 
     * <pre>
     * -unset-class-temporarily
     *  Unsets the class index temporarily before the filter is
     *  applied to the data.
     *  (default: no)
     * </pre>
     * 
     * <!-- options-end -->
     * 
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {

        // an option that is absent (or empty) leaves the current setting as is
        String atts = Utils.getOption('A', options);
        if (atts.length() > 0) {
            setAttributes(atts);
        }

        String nomString = Utils.getOption('N', options);
        if (nomString.length() > 0) {
            setNominalStringReplacementValue(nomString);
        }
        String numString = Utils.getOption('R', options);
        if (numString.length() > 0) {
            setNumericReplacementValue(numString);
        }
        String dateString = Utils.getOption('D', options);
        if (dateString.length() > 0) {
            setDateReplacementValue(dateString);
        }
        String formatString = Utils.getOption('F', options);
        if (formatString.length() > 0) {
            setDateFormat(formatString);
        }

        super.setOptions(options);

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Gets the current settings of the filter. Settings that are empty are
     * left out of the returned array.
     * 
     * @return an array of strings suitable for passing to setOptions
     */
    @Override
    public String[] getOptions() {

        ArrayList<String> options = new ArrayList<String>();

        if (getAttributes().length() > 0) {
            options.add("-A");
            options.add(getAttributes());
        }

        if (getNominalStringReplacementValue().length() > 0) {
            options.add("-N");
            options.add(getNominalStringReplacementValue());
        }

        if (getNumericReplacementValue().length() > 0) {
            options.add("-R");
            options.add(getNumericReplacementValue());
        }

        if (getDateReplacementValue().length() > 0) {
            options.add("-D");
            options.add(getDateReplacementValue());
        }

        if (getDateFormat().length() > 0) {
            options.add("-F");
            options.add(getDateFormat());
        }

        Collections.addAll(options, super.getOptions());

        // a zero-length seed array avoids returning {null} when no option is set
        return options.toArray(new String[0]);
    }

    /**
     * Tip text for this property suitable for displaying in the GUI.
     * 
     * @return the tip text for this property.
     */
    public String attributesTipText() {
        return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values. Specify an inclusive" + " range with \"-\". E.g: \"first-3,5,6-10,last\". Can alternatively" + " specify a comma separated list of attribute names. Note that " + " you can't mix indices and attribute names in the same list";
    }

    /**
     * Set the list of attributes to consider for replacing missing values
     * 
     * @param range the list of attributes to consider
     */
    public void setAttributes(String range) {
        m_range = range;
    }

    /**
     * Get the list of attributes to consider for replacing missing values
     * 
     * @return the list of attributes to consider
     */
    public String getAttributes() {
        return m_range;
    }

    /**
     * Tip text for this property suitable for displaying in the GUI.
     * 
     * @return the tip text for this property.
     */
    public String nominalStringReplacementValueTipText() {
        return "The constant to replace missing values in nominal/string attributes with";
    }

    /**
     * Get the nominal/string replacement value
     * 
     * @return the nominal/string replacement value
     */
    public String getNominalStringReplacementValue() {
        return m_nominalStringConstant;
    }

    /**
     * Set the nominal/string replacement value
     * 
     * @param nominalStringConstant the nominal/string constant to use
     */
    public void setNominalStringReplacementValue(String nominalStringConstant) {
        m_nominalStringConstant = nominalStringConstant;
    }

    /**
     * Tip text for this property suitable for displaying in the GUI.
     * 
     * @return the tip text for this property.
     */
    public String numericReplacementValueTipText() {
        return "The constant to replace missing values in numeric attributes with";
    }

    /**
     * Get the numeric replacement value
     * 
     * @return the numeric replacement value
     */
    public String getNumericReplacementValue() {
        return m_numericConstant;
    }

    /**
     * Set the numeric replacement value
     * 
     * @param numericConstant the numeric replacement value
     */
    public void setNumericReplacementValue(String numericConstant) {
        m_numericConstant = numericConstant;
    }

    /**
     * Tip text for this property suitable for displaying in the GUI.
     * 
     * @return the tip text for this property.
     */
    public String dateReplacementValueTipText() {
        return "The constant to replace missing values in date attributes with";
    }

    /**
     * Set the date replacement value
     * 
     * @param dateConstant the date replacement value
     */
    public void setDateReplacementValue(String dateConstant) {
        m_dateConstant = dateConstant;
    }

    /**
     * Get the date replacement value
     * 
     * @return the date replacement value
     */
    public String getDateReplacementValue() {
        return m_dateConstant;
    }

    /**
     * Tip text for this property suitable for displaying in the GUI.
     * 
     * @return the tip text for this property.
     */
    public String dateFormatTipText() {
        return "The formatting string to use for parsing the date replacement value";
    }

    /**
     * Set the date format to use for parsing the date replacement constant
     * 
     * @param dateFormat the date format to use
     */
    public void setDateFormat(String dateFormat) {
        m_defaultDateFormat = dateFormat;
    }

    /**
     * Get the date format to use for parsing the date replacement constant
     * 
     * @return the date format to use
     */
    public String getDateFormat() {
        return m_defaultDateFormat;
    }

    /**
     * Sets the format of the input instances. Resolves environment variables
     * in the user-supplied settings, determines the attributes to process,
     * validates and parses the replacement constants that are needed for the
     * selected attributes, and sets the output format (nominal attributes in
     * the selection gain the replacement constant as a label when they do not
     * have it already).
     * 
     * @param instanceInfo the structure of the incoming data
     * @return true, as the output format is available immediately
     * @throws Exception if the attribute selection cannot be resolved, a
     *           required replacement constant is missing, or a constant
     *           cannot be parsed
     */
    @Override
    public boolean setInputFormat(Instances instanceInfo) throws Exception {

        super.setInputFormat(instanceInfo);

        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        // each setting is resolved on its own so that one unresolvable
        // variable does not stop the remaining settings from being resolved
        m_resolvedNominalStringConstant = substituteEnvVars(m_nominalStringConstant);
        m_resolvedNumericConstant = substituteEnvVars(m_numericConstant);
        m_resolvedDateConstant = substituteEnvVars(m_dateConstant);
        m_resolvedDateFormat = substituteEnvVars(m_defaultDateFormat);
        m_resolvedRange = substituteEnvVars(m_range);

        m_selectedRange = buildSelectedRange(instanceInfo);

        // find out which kinds of attribute are part of the selection
        boolean hasNominalOrString = false;
        boolean hasNumeric = false;
        boolean hasDate = false;

        for (int i = 0; i < instanceInfo.numAttributes(); i++) {
            if (!m_selectedRange.isInRange(i)) {
                continue;
            }
            Attribute current = instanceInfo.attribute(i);
            if (current.isNominal() || current.isString()) {
                hasNominalOrString = true;
            } else if (current.isDate()) {
                // tested before isNumeric(), mirroring the order used in input()
                hasDate = true;
            } else if (current.isNumeric()) {
                hasNumeric = true;
            }
        }

        if (hasNominalOrString) {
            if (m_resolvedNominalStringConstant == null || m_resolvedNominalStringConstant.length() == 0) {
                if (m_resolvedNumericConstant != null && m_resolvedNumericConstant.length() > 0) {
                    // use the supplied numeric constant as a nominal value
                    m_resolvedNominalStringConstant = "" + m_resolvedNumericConstant;
                } else {
                    throw new Exception("Data contains nominal/string attributes and no " + "replacement constant has been supplied");
                }
            }
        }

        if (hasNumeric) {
            // test the resolved value, as is done for the other constants
            if (m_resolvedNumericConstant == null || m_resolvedNumericConstant.length() == 0) {
                if (m_resolvedNominalStringConstant != null && m_resolvedNominalStringConstant.length() > 0) {
                    // use the supplied nominal constant as numeric replacement
                    // value (if we can parse it as a number)
                    try {
                        Double.parseDouble(m_resolvedNominalStringConstant);
                        m_resolvedNumericConstant = m_resolvedNominalStringConstant;
                    } catch (NumberFormatException e) {
                        throw new Exception("Data contains numeric attributes and no numeric " + "constant has been supplied. Unable to parse nominal " + "constant as a number either.");
                    }
                } else {
                    throw new Exception("Data contains numeric attributes and no " + "replacement constant has been supplied");
                }
            }

            try {
                m_numericConstVal = Double.parseDouble(m_resolvedNumericConstant);
            } catch (NumberFormatException e) {
                throw new Exception("Unable to parse numeric constant");
            }
        }

        if (hasDate) {
            if (m_resolvedDateConstant == null || m_resolvedDateConstant.length() == 0) {
                throw new Exception("Data contains date attributes and no replacement constant has been " + "supplied");
            }

            SimpleDateFormat sdf = new SimpleDateFormat(m_resolvedDateFormat);
            Date d = sdf.parse(m_resolvedDateConstant);
            m_dateConstVal = d.getTime();
        }

        setOutputFormat(buildOutputFormat(instanceInfo));

        return true;
    }

    /**
     * Resolves environment variables in a single setting on a best-effort
     * basis.
     * 
     * @param value the raw setting (may be null or empty)
     * @return the setting with variables substituted, or the setting
     *         unchanged if it is null/empty or substitution fails
     */
    private String substituteEnvVars(String value) {
        if (value == null || value.length() == 0) {
            return value;
        }
        try {
            return m_env.substitute(value);
        } catch (Exception ignored) {
            // best effort: keep the literal text; a value that is unusable
            // is reported by the validation in setInputFormat()
            return value;
        }
    }

    /**
     * Builds the range of selected attributes from the resolved attribute
     * selection, interpreting it first as a range list of indices and, if
     * that fails, as a comma separated list of attribute names.
     * 
     * @param instanceInfo the structure of the incoming data
     * @return the range, with its upper limit set for the incoming data
     * @throws Exception if no attributes are specified or a named attribute
     *           is not present in the incoming data
     */
    private Range buildSelectedRange(Instances instanceInfo) throws Exception {
        // try and set up a Range first directly from the supplied string
        Range range = new Range(m_resolvedRange);
        try {
            range.setUpper(instanceInfo.numAttributes() - 1);
        } catch (IllegalArgumentException e) {
            // now try as a list of named attributes
            String[] parts = m_resolvedRange.split(",");
            if (parts.length == 0) {
                throw new Exception("Must specify which attributes to replace missing values for!");
            }

            StringBuilder indexList = new StringBuilder();
            for (String att : parts) {
                att = att.trim();
                Attribute a = instanceInfo.attribute(att);
                if (a == null) {
                    throw new Exception("I can't find the requested attribute '" + att + "' in the incoming instances.");
                }
                if (indexList.length() > 0) {
                    indexList.append(',');
                }
                // Range expects 1-based indices
                indexList.append(a.index() + 1);
            }
            range = new Range(indexList.toString());
            range.setUpper(instanceInfo.numAttributes() - 1);
        }
        return range;
    }

    /**
     * Builds the output header. Every selected, non-class nominal attribute
     * that lacks the replacement constant is replaced by a copy that has the
     * constant as its first label; all other attributes are carried over.
     * 
     * @param instanceInfo the structure of the incoming data
     * @return the header of the data produced by this filter
     */
    private Instances buildOutputFormat(Instances instanceInfo) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>(instanceInfo.numAttributes());
        boolean headerChanged = false;

        // a single pass keeps every replacement attribute at the position of
        // the attribute it replaces, also when only some of the selected
        // nominal attributes need the extra label
        for (int i = 0; i < instanceInfo.numAttributes(); i++) {
            Attribute current = instanceInfo.attribute(i);
            boolean needsNewLabel = i != instanceInfo.classIndex() && m_selectedRange.isInRange(i) && current.isNominal() && current.indexOfValue(m_resolvedNominalStringConstant) < 0;

            if (needsNewLabel) {
                List<String> values = new ArrayList<String>(current.numValues() + 1);
                values.add(m_resolvedNominalStringConstant);
                for (int j = 0; j < current.numValues(); j++) {
                    values.add(current.value(j));
                }

                Attribute newAtt = new Attribute(current.name(), values);
                newAtt.setWeight(current.weight());
                atts.add(newAtt);
                headerChanged = true;
            } else {
                atts.add(current); // Copy not necessary
            }
        }

        if (!headerChanged) {
            return new Instances(instanceInfo, 0);
        }

        Instances outputFormat = new Instances(instanceInfo.relationName(), atts, 0);
        outputFormat.setClassIndex(getInputFormat().classIndex());
        return outputFormat;
    }

    /**
     * Processes one instance: missing values of selected, non-class
     * attributes are replaced with the appropriate constant and the result is
     * pushed onto the output queue. Attributes outside the selection and the
     * class attribute keep their values (including missing ones).
     * 
     * @param inst the instance to process
     * @return true, as the processed instance is available for collection
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if the instance cannot be processed
     */
    @Override
    public boolean input(Instance inst) throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }

        double[] vals = new double[inst.numAttributes()];

        for (int i = 0; i < inst.numAttributes(); i++) {
            if (!m_selectedRange.isInRange(i)) {
                // not selected: pass through, missing or not
                vals[i] = inst.value(i);
                continue;
            }

            Attribute att = inst.attribute(i);
            boolean isClass = (i == inst.classIndex());

            if (inst.isMissing(i)) {
                if (isClass) {
                    // missing class values are never replaced
                    vals[i] = inst.value(i);
                } else if (att.isDate()) {
                    vals[i] = m_dateConstVal;
                } else if (att.isNumeric()) {
                    vals[i] = m_numericConstVal;
                } else if (att.isNominal()) {
                    // if the incoming attribute lacks the constant, the output
                    // attribute has it as its first label (index 0)
                    int temp = att.indexOfValue(m_resolvedNominalStringConstant);
                    vals[i] = (temp >= 0) ? temp : 0;
                } else if (att.isString()) {
                    // a bit of a hack here to try and detect if we're running
                    // in streaming mode or batch mode. If the string attribute
                    // has only one value in the header then it is likely that
                    // we're running in streaming mode (where only one value,
                    // the current instance's value, is maintained in memory)
                    if (att.numValues() <= 1) {
                        outputFormatPeek().attribute(i).setStringValue(m_resolvedNominalStringConstant);
                        vals[i] = 0;
                    } else {
                        vals[i] = outputFormatPeek().attribute(i).addStringValue(m_resolvedNominalStringConstant);
                    }
                } else {
                    vals[i] = inst.value(i);
                }
            } else {
                // in range but not missing
                if (att.isString()) {
                    // same streaming/batch detection hack as above
                    if (att.numValues() <= 1) {
                        outputFormatPeek().attribute(i).setStringValue(inst.stringValue(i));
                    } else {
                        outputFormatPeek().attribute(i).addStringValue(inst.stringValue(i));
                    }
                    vals[i] = outputFormatPeek().attribute(i).indexOfValue(inst.stringValue(i));
                } else if (att.isNominal() && !isClass) {
                    // label indices may have shifted in the output attribute,
                    // so look the label up by name
                    vals[i] = outputFormatPeek().attribute(i).indexOfValue(inst.stringValue(i));
                } else {
                    vals[i] = inst.value(i);
                }
            }
        }

        Instance newInst = null;
        if (inst instanceof SparseInstance) {
            newInst = new SparseInstance(inst.weight(), vals);
        } else {
            newInst = new DenseInstance(inst.weight(), vals);
        }

        newInst.setDataset(getOutputFormat());
        push(newInst, false); // No need to copy

        return true;
    }

    /**
     * Set the environment variables to use when resolving the settings of
     * this filter.
     * 
     * @param env the environment variables to use
     */
    @Override
    public void setEnvironment(Environment env) {
        m_env = env;
    }

    /**
     * Main method for testing this class.
     * 
     * @param args should contain arguments to the filter: use -h for help
     */
    public static void main(String[] args) {
        runFilter(new ReplaceMissingWithUserConstant(), args);
    }
}
